{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import malaya\n",
    "from malaya.dictionary import *\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "65823"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Union of every Malay vocabulary source, written as one chained expression.\n",
    "malay_words = (\n",
    "    MALAY_WORDS\n",
    "    | CAMBRIDGE_MALAY_WORDS\n",
    "    | KAMUS_DEWAN_WORDS\n",
    "    | DBP_WORDS\n",
    "    | negeri\n",
    "    | city\n",
    "    | country\n",
    "    | daerah\n",
    "    | parlimen\n",
    "    | adun\n",
    ")\n",
    "len(malay_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "234177"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Start from the full English word list (presumably exposed by the\n",
    "# `malaya.dictionary` star import - confirm).\n",
    "english_words = ENGLISH_WORDS\n",
    "len(english_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "231316"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# English-only vocabulary: English words that are absent from the Malay set.\n",
    "english_minus = english_words.difference(malay_words)\n",
    "len(english_minus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "231117"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# A word is dropped when it contains any of these substrings ...\n",
    "blocked_fragments = (\n",
    "    'haha', 'yeay', 'yes', 'ooo', 'insha', 'huhu', 'insya', 'hew', 'uwuu',\n",
    "    'wkwk', 'hoho', 'babi', 'meow', 'aiii', 'alham', 'mashaa',\n",
    ")\n",
    "# ... or when it is exactly equal to one of these.\n",
    "blocked_words = ('rt', 'takda', 'cer')\n",
    "\n",
    "english_minus = {\n",
    "    word\n",
    "    for word in english_minus\n",
    "    if word not in blocked_words\n",
    "    and not any(fragment in word for fragment in blocked_fragments)\n",
    "}\n",
    "len(english_minus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "def language_detection_textcleaning(string):\n",
    "    \"\"\"\n",
    "    Normalise raw text into lowercase, space-separated tokens.\n",
    "\n",
    "    Steps, in order: drop whitespace-separated tokens containing '#' or\n",
    "    '@', strip URLs, replace digits and most punctuation with spaces,\n",
    "    collapse repeated spaces, lowercase.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    string : str\n",
    "        Raw input text.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    str\n",
    "        Cleaned text; empty if nothing survives the cleaning.\n",
    "    \"\"\"\n",
    "    # split() + join also removes every newline and tab, so no separate\n",
    "    # whitespace stripping is needed afterwards.\n",
    "    kept = [i for i in string.split() if '#' not in i and '@' not in i]\n",
    "\n",
    "    # The dot after `www` has to be escaped: an unescaped `.` also matches\n",
    "    # a space, which turned 'awww so cute' into 'a cute'.\n",
    "    string = re.sub(r'http\\S+|www\\.\\S+', '', ' '.join(kept))\n",
    "\n",
    "    # Pad punctuation with spaces. Of these characters only '=' survives\n",
    "    # the substitution below, where it stays as a standalone token.\n",
    "    chars = ',.()!:\\'\"/;=-'\n",
    "    for c in chars:\n",
    "        string = string.replace(c, f' {c} ')\n",
    "\n",
    "    string = re.sub(\n",
    "        '[0-9!@#$%^&*()_\\\\-+{}|\\\\~`\\'\";:?/.>,<]', ' ', string, flags=re.UNICODE\n",
    "    )\n",
    "    string = re.sub(r'[ ]+', ' ', string).strip()\n",
    "\n",
    "    return string.lower()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def split(data):\n",
    "    \"\"\"Return the set of unique tokens in `data` after text cleaning.\"\"\"\n",
    "    cleaned = language_detection_textcleaning(data)\n",
    "    return set(cleaned.split())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# NOTE(review): this file is only opened for writing by a later cell, so on\n",
    "# a fresh Restart & Run All this peek shows a previous run's output (or\n",
    "# fails if the file does not exist yet).\n",
    "!head -n 100 filter-malay-rojak-rojak.jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# !pip3.8 install lingua-language-detector\n",
    "# from lingua import Language, LanguageDetectorBuilder\n",
    "# languages = [Language.ENGLISH, Language.CHINESE, Language.MALAY]\n",
    "# detector = LanguageDetectorBuilder.from_languages(*languages).build()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n"
     ]
    }
   ],
   "source": [
    "# Build the fastText-based language detector that the routing loop uses to\n",
    "# pre-filter rows.\n",
    "language_detection = malaya.language_detection.fasttext(model = 'mesolitica/fasttext-language-detection-v1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['rojak']"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Quick check of the detector on a single sentence; the saved output for\n",
    "# this cell is ['rojak'].\n",
    "language_detection.predict(['share kalau nak tgk k view wea'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One output file per bucket (ms / en / rojak); opened for writing here and\n",
    "# closed in a later cell.\n",
    "output_paths = (\n",
    "    'filter-malay-rojak-ms.jsonl',\n",
    "    'filter-malay-rojak-en.jsonl',\n",
    "    'filter-malay-rojak-rojak.jsonl',\n",
    ")\n",
    "ms_open, en_open, rojak_open = (open(path, 'w') for path in output_paths)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "13573586it [13:18, 19665.78it/s]IOPub message rate exceeded.\n",
      "The notebook server will temporarily stop sending output\n",
      "to the client in order to avoid crashing it.\n",
      "To change this limit, set the config variable\n",
      "`--NotebookApp.iopub_msg_rate_limit`.\n",
      "\n",
      "Current values:\n",
      "NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
      "NotebookApp.rate_limit_window=3.0 (secs)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "# Route every kept sentence to the ms, en or rojak output file, depending on\n",
    "# how many of its unique tokens are found in `english_minus`.\n",
    "with open('prepare-malay-text-ms.jsonl') as fopen:\n",
    "    for line in tqdm(fopen):\n",
    "        data = json.loads(line)\n",
    "        # Only rows the detector labels 'rojak' or 'malay' are processed.\n",
    "        if language_detection.predict([data])[0] not in {'rojak', 'malay'}:\n",
    "            continue\n",
    "        t = language_detection_textcleaning(data)\n",
    "        if not t:\n",
    "            continue\n",
    "\n",
    "        ngs = set(t.split())\n",
    "        len_en = len(ngs & english_minus)\n",
    "        if len_en == len(ngs):\n",
    "            # every unique token is in the English-only vocabulary\n",
    "            f = en_open\n",
    "        elif len_en:\n",
    "            # at least one token, but not all of them\n",
    "            f = rojak_open\n",
    "        else:\n",
    "            f = ms_open\n",
    "\n",
    "        # No per-line flush(): the file objects buffer writes, and the\n",
    "        # close() calls in the following cell flush them.\n",
    "        f.write(f'{json.dumps(t)}\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flush and release the three output files, in the order they were opened.\n",
    "for handle in (ms_open, en_open, rojak_open):\n",
    "    handle.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
